library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.8
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(coefplot)
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
library(visdat)
library(yardstick)
## For binary classification, the first factor level is assumed to be the event.
## Use the argument `event_level = "second"` to alter this as needed.
## 
## Attaching package: 'yardstick'
## The following objects are masked from 'package:caret':
## 
##     precision, recall, sensitivity, specificity
## The following object is masked from 'package:readr':
## 
##     spec
# Load the raw project data; col_names = TRUE takes headers from the first row.
df_all <- readr::read_csv("final_project_bonus.csv", col_names = TRUE)
## Rows: 1325 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): region, customer, outcome
## dbl (35): rowid, xb_01, xb_02, xb_03, xn_01, xn_02, xn_03, xa_01, xa_02, xa_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Quick numeric/character summary of every column.
summary(df_all)
##      rowid         region            customer             xb_01       
##  Min.   :   1   Length:1325        Length:1325        Min.   :-4.000  
##  1st Qu.: 332   Class :character   Class :character   1st Qu.: 2.500  
##  Median : 663   Mode  :character   Mode  :character   Median : 3.345  
##  Mean   : 663                                         Mean   : 3.427  
##  3rd Qu.: 994                                         3rd Qu.: 4.200  
##  Max.   :1325                                         Max.   :15.000  
##      xb_02            xb_03            xn_01            xn_02       
##  Min.   :-4.000   Min.   :-7.000   Min.   :-4.000   Min.   :-4.000  
##  1st Qu.: 3.000   1st Qu.:-1.000   1st Qu.: 1.000   1st Qu.: 2.000  
##  Median : 6.000   Median : 1.000   Median : 1.654   Median : 4.000  
##  Mean   : 5.762   Mean   : 1.305   Mean   : 1.629   Mean   : 3.723  
##  3rd Qu.: 8.000   3rd Qu.: 3.000   3rd Qu.: 2.455   3rd Qu.: 6.000  
##  Max.   :16.000   Max.   :15.000   Max.   :10.000   Max.   :14.000  
##      xn_03           xa_01            xa_02           xa_03        
##  Min.   :-7.00   Min.   :-3.000   Min.   :-3.00   Min.   :-12.000  
##  1st Qu.:-2.00   1st Qu.: 6.000   1st Qu.: 8.00   1st Qu.:  0.000  
##  Median : 0.00   Median : 8.000   Median :13.00   Median :  3.000  
##  Mean   :-0.28   Mean   : 8.087   Mean   :13.16   Mean   :  3.888  
##  3rd Qu.: 1.00   3rd Qu.: 9.857   3rd Qu.:18.00   3rd Qu.:  7.000  
##  Max.   :10.00   Max.   :35.000   Max.   :38.00   Max.   : 35.000  
##      xb_04             xb_05             xb_06            xb_07       
##  Min.   :-2.0000   Min.   :-5.0000   Min.   :-2.000   Min.   :-2.000  
##  1st Qu.: 0.8991   1st Qu.:-0.3333   1st Qu.: 1.167   1st Qu.: 1.667  
##  Median : 1.1474   Median : 0.5000   Median : 2.000   Median : 2.000  
##  Mean   : 1.1860   Mean   : 0.4525   Mean   : 2.134   Mean   : 2.093  
##  3rd Qu.: 1.4280   3rd Qu.: 1.0000   3rd Qu.: 3.000   3rd Qu.: 2.400  
##  Max.   : 8.0000   Max.   : 8.0000   Max.   :11.000   Max.   : 8.000  
##      xb_08             xn_04             xn_05             xn_06       
##  Min.   :-4.0000   Min.   :-4.0000   Min.   :-4.0000   Min.   :-4.000  
##  1st Qu.:-0.1923   1st Qu.: 0.3333   1st Qu.:-1.0000   1st Qu.: 0.800  
##  Median : 0.2500   Median : 0.6292   Median : 0.0000   Median : 1.250  
##  Mean   : 0.2795   Mean   : 0.6326   Mean   :-0.1031   Mean   : 1.485  
##  3rd Qu.: 1.0000   3rd Qu.: 1.0000   3rd Qu.: 0.6667   3rd Qu.: 2.000  
##  Max.   : 8.0000   Max.   : 5.0000   Max.   : 5.0000   Max.   :12.000  
##      xn_07            xn_08             xa_04            xa_05       
##  Min.   :-4.000   Min.   :-4.0000   Min.   :-2.000   Min.   :-8.000  
##  1st Qu.: 1.000   1st Qu.:-1.0000   1st Qu.: 2.250   1st Qu.: 0.000  
##  Median : 1.404   Median :-0.2700   Median : 2.875   Median : 1.500  
##  Mean   : 1.439   Mean   :-0.2356   Mean   : 2.947   Mean   : 1.403  
##  3rd Qu.: 1.862   3rd Qu.: 0.1250   3rd Qu.: 3.483   3rd Qu.: 2.667  
##  Max.   : 7.000   Max.   : 5.0000   Max.   :14.000   Max.   :14.000  
##      xa_06            xa_07            xa_08            xw_01       
##  Min.   :-2.000   Min.   :-2.000   Min.   :-5.000   Min.   :  7.00  
##  1st Qu.: 3.000   1st Qu.: 3.682   1st Qu.: 0.500   1st Qu.: 44.50  
##  Median : 4.250   Median : 4.554   Median : 1.186   Median : 57.00  
##  Mean   : 5.104   Mean   : 4.656   Mean   : 1.241   Mean   : 56.94  
##  3rd Qu.: 6.500   3rd Qu.: 5.333   3rd Qu.: 2.000   3rd Qu.: 68.18  
##  Max.   :25.000   Max.   :17.000   Max.   :14.000   Max.   :108.00  
##      xw_02            xw_03            xs_01             xs_02         
##  Min.   :  0.00   Min.   :  7.00   Min.   :-0.3612   Min.   :-0.89585  
##  1st Qu.:  9.00   1st Qu.: 59.00   1st Qu.: 0.1539   1st Qu.:-0.13251  
##  Median : 25.00   Median : 92.00   Median : 0.2190   Median : 0.04224  
##  Mean   : 32.54   Mean   : 78.62   Mean   : 0.2216   Mean   : 0.03560  
##  3rd Qu.: 52.00   3rd Qu.:101.00   3rd Qu.: 0.2845   3rd Qu.: 0.20362  
##  Max.   :108.00   Max.   :114.00   Max.   : 0.9979   Max.   : 0.99793  
##      xs_03             xs_04            xs_05             xs_06       
##  Min.   :-0.3612   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.: 0.2444   1st Qu.:0.2355   1st Qu.:0.07365   1st Qu.:0.2809  
##  Median : 0.3801   Median :0.2857   Median :0.16265   Median :0.4149  
##  Mean   : 0.4273   Mean   :0.2945   Mean   :0.18671   Mean   :0.4565  
##  3rd Qu.: 0.5943   3rd Qu.:0.3377   3rd Qu.:0.25991   3rd Qu.:0.5901  
##  Max.   : 1.7907   Max.   :1.0342   Max.   :1.03416   Max.   :1.4066  
##     response         outcome         
##  Min.   : 0.3367   Length:1325       
##  1st Qu.: 1.5868   Class :character  
##  Median : 2.2547   Mode  :character  
##  Mean   : 2.7436                     
##  3rd Qu.: 3.3418                     
##  Max.   :40.8012
# Count of rows per customer level.
ggplot(df_all, aes(x = customer)) +
  geom_bar()

# Count of rows per region level.
ggplot(df_all, aes(x = region)) +
  geom_bar()

# Count of rows per outcome class (event vs non_event).
ggplot(df_all, aes(x = outcome)) +
  geom_bar()

As you can see, the data is massively imbalanced. Customers S and U barely account for anything, and the ratio of event to non-event outcomes is quite dramatic.

Let's see if we have any missing data and, if so, what it looks like.

# Visualize the missingness pattern; cluster = TRUE groups rows with similar
# missingness so any systematic gaps stand out.
visdat::vis_miss(df_all, cluster=TRUE) +
  # Shrink and rotate the x labels so all 38 column names remain legible.
  theme(axis.text.x = element_text(size = 6.5, angle = 90))
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

Well, that is some good news: nothing is missing. Let's look at customer as a proportion of the total.

# Customer share of all rows, ordered most- to least-frequent.
# after_stat(prop) replaces the superseded stat(prop) helper
# (after_stat() is available since ggplot2 3.3.0; session shows 3.3.5).
df_all %>% 
  mutate(customer = forcats::fct_infreq(customer)) %>% 
  ggplot(mapping = aes(x = customer, y = after_stat(prop), group = 1)) +
  geom_bar() +
  coord_flip() +
  labs(x = "") +
  theme_bw()

This confirms what we already know, and verifies that G is also much higher than the rest. But let's look at our two categorical variables in combination.

# Joint distribution of customer x region as a heatmap of cell proportions.
df_all %>% 
  # Collapse levels below 5% of rows into "Other" for both factors.
  mutate(customer = forcats::fct_lump_prop(customer, 0.05),
         region = forcats::fct_lump_prop(region, 0.05)) %>% 
  count(customer, region) %>% 
  mutate(prop_total = n / sum(n)) %>% 
  ggplot(mapping = aes(x = customer, y = region)) +
  # Discretize the proportion into 6 bins for a readable discrete fill.
  # NOTE(review): cells with prop_total > 0.18 would fall outside these
  # breaks and get an NA fill — confirm 0.18 safely exceeds the max cell.
  geom_tile(mapping = aes(fill = cut(prop_total,
                                     breaks = seq(0, 0.18, by = 0.03))),
            color = "black") +
  # White text on dark (low-proportion) tiles, black on bright ones;
  # 0.09 is the midpoint of the 0-0.18 fill range.
  geom_text(mapping = aes(label = signif(prop_total, 3),
                          color = prop_total < 0.09)) +
  scale_fill_viridis_d("Proportion") +
  scale_color_manual(guide = 'none',
                     values = c("TRUE" = "white", 
                                "FALSE" = "black")) +
  theme_bw()

As you would suspect, not all customer groups exist in all regions, so an interaction between the two may not be the best idea. We should keep that in mind. The proportions are quite uneven as well.

Even though we’ve already ‘explored’ the data, let's look at it again in relation to the outcome.

# Binary indicator for the event class (1 = "event", 0 = "non_event").
df_y <- df_all %>% mutate(y = ifelse(outcome == "event", 1, 0))

# Jittered scatter of the binary outcome against every input whose name starts
# with `x_prefix`, colored and row-faceted by one grouping column ("region" or
# "customer"). Replaces ten near-identical copy-pasted pipelines.
# NOTE(review): alpha = 0.1 is mapped inside aes() (as in the original), which
# creates an alpha legend; setting it outside aes() is the usual intent.
plot_outcome_jitter <- function(df, x_prefix, group_col) {
  df %>% 
    select(starts_with(x_prefix), region, customer, response, y) %>% 
    rowid_to_column() %>% 
    pivot_longer(!c("rowid", "region", "customer", "response", "y")) %>% 
    ggplot(mapping = aes(x = value, y = y, alpha = 0.1,
                         color = .data[[group_col]])) +
    geom_jitter(height = 0.04) +
    # reformulate("name", group_col) builds e.g. region ~ name, matching the
    # original facet_grid(region~name, ...) calls.
    facet_grid(reformulate("name", group_col), scales = 'free')
}

plot_outcome_jitter(df_y, "xa", "region")

plot_outcome_jitter(df_y, "xa", "customer")

plot_outcome_jitter(df_y, "xb", "region")

plot_outcome_jitter(df_y, "xb", "customer")

plot_outcome_jitter(df_y, "xn", "region")

plot_outcome_jitter(df_y, "xn", "customer")

plot_outcome_jitter(df_y, "xw", "region")

plot_outcome_jitter(df_y, "xw", "customer")

plot_outcome_jitter(df_y, "xs", "region")

plot_outcome_jitter(df_y, "xs", "customer")

Customers S and U have just a couple of data points for some of the features…

Let's make a model without accounting for class imbalance for now, so we can compare. Just a basic all-additive model.

# 5-fold cross-validation. twoClassSummary reports ROC/Sens/Spec, which
# requires class probabilities; keep the resample predictions for later
# inspection of the folds.
my_ctrl <- trainControl(method = 'cv', number = 5,
                             summaryFunction = twoClassSummary,
                             classProbs = TRUE,
                             savePredictions = TRUE)
# Select models by area under the ROC curve.
my_metric <- "ROC"
# All-additive baseline recipe: outcome vs every other column, minus the
# continuous `response` variable. Numeric predictors are standardized
# (center then scale = z-score) and nominal predictors dummy-encoded.
# NOTE(review): `rowid` is retained as a standardized numeric predictor
# (it appears in the baked names below) — confirm that is intended.
default_model_all_add <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(response) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors())

# Sanity check: prep the recipe on the full data and list the baked columns.
default_model_all_add %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  colnames()
##  [1] "rowid"      "xb_01"      "xb_02"      "xb_03"      "xn_01"     
##  [6] "xn_02"      "xn_03"      "xa_01"      "xa_02"      "xa_03"     
## [11] "xb_04"      "xb_05"      "xb_06"      "xb_07"      "xb_08"     
## [16] "xn_04"      "xn_05"      "xn_06"      "xn_07"      "xn_08"     
## [21] "xa_04"      "xa_05"      "xa_06"      "xa_07"      "xa_08"     
## [26] "xw_01"      "xw_02"      "xw_03"      "xs_01"      "xs_02"     
## [31] "xs_03"      "xs_04"      "xs_05"      "xs_06"      "outcome"   
## [36] "region_YY"  "region_ZZ"  "customer_B" "customer_C" "customer_D"
## [41] "customer_E" "customer_F" "customer_G" "customer_H" "customer_I"
## [46] "customer_J" "customer_K" "customer_L" "customer_M" "customer_N"
## [51] "customer_O" "customer_P" "customer_Q" "customer_R" "customer_S"
## [56] "customer_U"

Let's train a baseline model. Note that `method = "glm"` fits an unpenalized logistic regression — we can try elastic net later.

# Fix the RNG seed so CV fold assignments are reproducible across models.
set.seed(98123)
# Unpenalized logistic regression (method = "glm") on the all-additive recipe;
# caret preps/bakes the recipe independently within each CV fold.
fit_glm_add_all <- train(default_model_all_add, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)
## Warning: There are new levels in a factor: S
## There are new levels in a factor: S
## There are new levels in a factor: S
## Warning: There are new levels in a factor: U
## There are new levels in a factor: U
## There are new levels in a factor: U
# Resampled CV performance (ROC / Sens / Spec) of the additive GLM.
fit_glm_add_all
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, center, scale, dummy 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC        Sens  Spec     
##   0.8306414  0.24  0.9710165

Without a tuning grid, it didn’t perform that badly compared to the ‘clean’ data we worked with. But let's add some interactions with region and customer.

# Region-interaction recipe: drop customer and the continuous response,
# standardize, dummy-encode region, then cross every region dummy with every
# feature starting with "x" (builds region_YY_x_* and region_ZZ_x_* terms).
# NOTE(review): `rowid` is still retained as a predictor here as well.
default_model_region_X <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(customer, response) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact(~starts_with("region"):starts_with("x"))

# Sanity check: confirm the interaction columns the recipe produces.
default_model_region_X %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  colnames()
##   [1] "rowid"             "xb_01"             "xb_02"            
##   [4] "xb_03"             "xn_01"             "xn_02"            
##   [7] "xn_03"             "xa_01"             "xa_02"            
##  [10] "xa_03"             "xb_04"             "xb_05"            
##  [13] "xb_06"             "xb_07"             "xb_08"            
##  [16] "xn_04"             "xn_05"             "xn_06"            
##  [19] "xn_07"             "xn_08"             "xa_04"            
##  [22] "xa_05"             "xa_06"             "xa_07"            
##  [25] "xa_08"             "xw_01"             "xw_02"            
##  [28] "xw_03"             "xs_01"             "xs_02"            
##  [31] "xs_03"             "xs_04"             "xs_05"            
##  [34] "xs_06"             "outcome"           "region_YY"        
##  [37] "region_ZZ"         "region_YY_x_xb_01" "region_YY_x_xb_02"
##  [40] "region_YY_x_xb_03" "region_YY_x_xn_01" "region_YY_x_xn_02"
##  [43] "region_YY_x_xn_03" "region_YY_x_xa_01" "region_YY_x_xa_02"
##  [46] "region_YY_x_xa_03" "region_YY_x_xb_04" "region_YY_x_xb_05"
##  [49] "region_YY_x_xb_06" "region_YY_x_xb_07" "region_YY_x_xb_08"
##  [52] "region_YY_x_xn_04" "region_YY_x_xn_05" "region_YY_x_xn_06"
##  [55] "region_YY_x_xn_07" "region_YY_x_xn_08" "region_YY_x_xa_04"
##  [58] "region_YY_x_xa_05" "region_YY_x_xa_06" "region_YY_x_xa_07"
##  [61] "region_YY_x_xa_08" "region_YY_x_xw_01" "region_YY_x_xw_02"
##  [64] "region_YY_x_xw_03" "region_YY_x_xs_01" "region_YY_x_xs_02"
##  [67] "region_YY_x_xs_03" "region_YY_x_xs_04" "region_YY_x_xs_05"
##  [70] "region_YY_x_xs_06" "region_ZZ_x_xb_01" "region_ZZ_x_xb_02"
##  [73] "region_ZZ_x_xb_03" "region_ZZ_x_xn_01" "region_ZZ_x_xn_02"
##  [76] "region_ZZ_x_xn_03" "region_ZZ_x_xa_01" "region_ZZ_x_xa_02"
##  [79] "region_ZZ_x_xa_03" "region_ZZ_x_xb_04" "region_ZZ_x_xb_05"
##  [82] "region_ZZ_x_xb_06" "region_ZZ_x_xb_07" "region_ZZ_x_xb_08"
##  [85] "region_ZZ_x_xn_04" "region_ZZ_x_xn_05" "region_ZZ_x_xn_06"
##  [88] "region_ZZ_x_xn_07" "region_ZZ_x_xn_08" "region_ZZ_x_xa_04"
##  [91] "region_ZZ_x_xa_05" "region_ZZ_x_xa_06" "region_ZZ_x_xa_07"
##  [94] "region_ZZ_x_xa_08" "region_ZZ_x_xw_01" "region_ZZ_x_xw_02"
##  [97] "region_ZZ_x_xw_03" "region_ZZ_x_xs_01" "region_ZZ_x_xs_02"
## [100] "region_ZZ_x_xs_03" "region_ZZ_x_xs_04" "region_ZZ_x_xs_05"
## [103] "region_ZZ_x_xs_06"
# Same seed as the other fits so every model sees identical CV folds.
set.seed(98123)
# Logistic regression with region-by-feature interaction terms.
fit_glm_region_X <- train(default_model_region_X, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)

# Resampled CV performance of the region-interaction GLM.
fit_glm_region_X
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, center, scale, dummy, interact 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC        Sens  Spec    
##   0.7708936  0.28  0.933617

Here we start to see a much lower ROC when we add the region interactions. But let's check out customer as well.

# Customer-interaction recipe: drop region and the continuous response, then
# cross every customer dummy with every x-feature. With 19 customer levels
# this creates a very wide, sparse design (source of the rank-deficiency and
# "new levels" warnings below for rare levels like S and U).
default_model_customer_X <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(region, response) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact(~starts_with("customer"):starts_with("x"))

# Spot-check the last few generated interaction columns.
baked_cols <- default_model_customer_X %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  colnames()
tail(baked_cols)
## [1] "customer_U_x_xs_01" "customer_U_x_xs_02" "customer_U_x_xs_03"
## [4] "customer_U_x_xs_04" "customer_U_x_xs_05" "customer_U_x_xs_06"
# Same seed: identical CV folds across all candidate models.
set.seed(98123)
# Logistic regression with customer-by-feature interactions; expect
# convergence / separation warnings because of sparse customer levels.
fit_glm_customer_X <- train(default_model_customer_X, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Resampled CV performance of the customer-interaction GLM.
fit_glm_customer_X
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, center, scale, dummy, interact 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC       Sens  Spec     
##   0.576055  0.4   0.7552973

Notice we get a lot of warnings when running this model, because some of the factor levels, like S, are sparse.

We see the same diminished ROC values. We will use a new package called themis that adds upsampling steps to recipes.

library(themis)
## 
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample

First, let's see if we can make improvements by lumping customers. We will lump every level with less than a 5% proportion, which will grab the lowest ones.

# Like the customer-interaction recipe, but first collapse customer levels
# below a 5% share into a single "other" level (step_other), so rare levels
# such as S and U no longer produce their own sparse dummy columns.
lump_model_customer_X <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(region, response) %>% 
  step_other(customer, threshold = 0.05) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact(~starts_with("customer"):starts_with("x"))

# Verify the rare customer levels were folded into customer_other columns.
lump_model_customer_X %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  colnames() %>%
  tail()
## [1] "customer_other_x_xs_01" "customer_other_x_xs_02" "customer_other_x_xs_03"
## [4] "customer_other_x_xs_04" "customer_other_x_xs_05" "customer_other_x_xs_06"
# Same seed: identical CV folds across all candidate models.
set.seed(98123)
# Logistic regression with lumped-customer interactions.
lump_glm_customer_X <- train(lump_model_customer_X, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Resampled CV performance of the lumped-customer interaction GLM.
lump_glm_customer_X
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, other, center, scale, dummy, interact 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.7082979  0.4466667  0.8689362

Now let's try upsampling instead. We will upsample the minority customer levels to 50% of the majority level, so they are ‘picked’ more often; then we will try again at a full 100%.

# Upsampling variant: themis::step_upsample replicates rows of the minority
# customer levels up to 50% of the largest level's count (over_ratio = 0.5)
# before standardizing and building interactions.
# NOTE(review): this balances the `customer` factor, not the `outcome`
# classes — confirm that matches the intent stated in the prose.
upsample_50_model_customer_X <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(region, response) %>% 
  step_upsample(customer, over_ratio = 0.5) %>%
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact(~starts_with("customer"):starts_with("x"))

# Confirm the generated column names for the 50% upsampled recipe.
upsample_50_model_customer_X %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  colnames() %>%
  tail()
## [1] "customer_U_x_xs_01" "customer_U_x_xs_02" "customer_U_x_xs_03"
## [4] "customer_U_x_xs_04" "customer_U_x_xs_05" "customer_U_x_xs_06"
# Same seed: identical CV folds across all candidate models.
set.seed(98123)
# Logistic regression on customer interactions with 50% upsampling.
up_50_glm_customer_X <- train(upsample_50_model_customer_X, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Resampled CV performance with 50% customer upsampling.
up_50_glm_customer_X
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, upsample, center, scale, dummy, interact 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC       Sens  Spec     
##   0.591645  0.42  0.7297181
# Same as the 50% variant but upsample every customer level all the way to
# the majority level's count (over_ratio = 1, i.e. fully balanced customers).
upsample_100_model_customer_X <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(region, response) %>% 
  step_upsample(customer, over_ratio = 1) %>%
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact(~starts_with("customer"):starts_with("x"))

# Confirm the generated column names for the fully upsampled recipe.
upsample_100_model_customer_X %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  colnames() %>%
  tail()
## [1] "customer_U_x_xs_01" "customer_U_x_xs_02" "customer_U_x_xs_03"
## [4] "customer_U_x_xs_04" "customer_U_x_xs_05" "customer_U_x_xs_06"
# Same seed: identical CV folds across all candidate models.
set.seed(98123)
# Logistic regression on customer interactions with full (100%) upsampling.
up_100_glm_customer_X <- train(upsample_100_model_customer_X, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Resampled CV performance with full customer upsampling.
up_100_glm_customer_X
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, upsample, center, scale, dummy, interact 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.5673054  0.3466667  0.7630296

Now let's take a look at near-zero-variance filtering. It will remove features that are sparse or highly imbalanced.

# Near-zero-variance variant: step_nzv drops predictors that are nearly
# constant / highly imbalanced before modeling.
# NOTE(review): step_nzv runs BEFORE step_dummy, so the sparse customer dummy
# columns (e.g. customer_S) are never screened — the customer_U interaction
# columns still appear below and the "new levels" warnings persist. Moving
# step_nzv after step_dummy may be what was intended; confirm.
nzv_model_customer_X <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(region, response) %>% 
  step_nzv(all_predictors()) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact(~starts_with("customer"):starts_with("x"))

# Inspect the tail of the baked column names for the NZV-filtered recipe.
nzv_model_customer_X %>%
  prep(training = df_all, retain = TRUE) %>%
  bake(new_data = NULL) %>%
  colnames() %>%
  tail()
## [1] "customer_U_x_xs_01" "customer_U_x_xs_02" "customer_U_x_xs_03"
## [4] "customer_U_x_xs_04" "customer_U_x_xs_05" "customer_U_x_xs_06"
# Same seed: identical CV folds across all candidate models.
set.seed(98123)
# Logistic regression on customer interactions after NZV filtering.
nzv_glm_customer_X <- train(nzv_model_customer_X, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
nzv_glm_customer_X
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, nzv, center, scale, dummy, interact 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC       Sens  Spec     
##   0.576055  0.4   0.7552973

Now that we have some powerful tools to help deal with imbalances, let's see what happens when we interact region AND customer. Let's try it with near-zero variance filtering. There will be some features with no values, so even upsampling won't really work that well. But we can try the near-zero variance filter and see what happens.

# Same NZV recipe, but keep region so three-way customer:region:x
# interactions can be formed after dummy encoding.
nzv_model_customer_X_region <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(response) %>% 
  step_nzv(all_predictors()) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) %>% 
  step_interact(~starts_with("customer"):starts_with("region"):starts_with("x")) 


# Verify the three-way interaction columns were generated.
nzv_model_customer_X_region %>% 
  prep(training = df_all, retain = TRUE) %>% 
  bake(new_data = NULL) %>% 
  names() %>% tail()
## [1] "customer_U_x_region_ZZ_x_xs_01" "customer_U_x_region_ZZ_x_xs_02"
## [3] "customer_U_x_region_ZZ_x_xs_03" "customer_U_x_region_ZZ_x_xs_04"
## [5] "customer_U_x_region_ZZ_x_xs_05" "customer_U_x_region_ZZ_x_xs_06"
# Cross-validated GLM on the customer:region:x interaction recipe.
set.seed(98123)
nzv_glm_customer_X_region <- train(nzv_model_customer_X_region, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: S
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: There are new levels in a factor: U
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
nzv_glm_customer_X_region
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, nzv, center, scale, dummy, interact 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.6020213  0.3133333  0.8150536

Last but not least, let's apply the same sampling techniques to the all-additive model, so we can compare apples to apples.

# All-additive recipe with 50% upsampling (no interaction terms).
# NOTE(review): step_upsample balances the `customer` factor levels, not the
# outcome classes — confirm that is intentional (it mirrors the earlier
# up_50 customer-interaction model).
up_50_model_all_add <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(response) %>% 
  step_upsample(customer, over_ratio = .5) %>%
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) 
  


# Check the dummy columns produced for customer.
up_50_model_all_add %>% 
  prep(training = df_all, retain = TRUE) %>% 
  bake(new_data = NULL) %>% 
  names() %>% tail()
## [1] "customer_O" "customer_P" "customer_Q" "customer_R" "customer_S"
## [6] "customer_U"
# Cross-validated GLM on the upsampled all-additive recipe.
set.seed(98123)
up_50_glm_all_add <- train(up_50_model_all_add, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)
## Warning: There are new levels in a factor: S
## There are new levels in a factor: S
## There are new levels in a factor: S
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: There are new levels in a factor: U
## There are new levels in a factor: U
## There are new levels in a factor: U
up_50_glm_all_add
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, upsample, center, scale, dummy 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8093478  0.2533333  0.9573741
# All-additive recipe with near-zero variance filtering instead of
# upsampling.
nzv_model_all_add <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(response) %>% 
  step_nzv(all_predictors()) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) 
  


# Check the surviving dummy columns for customer.
nzv_model_all_add %>% 
  prep(training = df_all, retain = TRUE) %>% 
  bake(new_data = NULL) %>% 
  names() %>% tail()
## [1] "customer_O" "customer_P" "customer_Q" "customer_R" "customer_S"
## [6] "customer_U"
# Cross-validated GLM on the NZV-filtered all-additive recipe.
set.seed(98123)
nzv_glm_all_add <- train(nzv_model_all_add, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)
## Warning: There are new levels in a factor: S
## There are new levels in a factor: S
## There are new levels in a factor: S
## Warning: There are new levels in a factor: U
## There are new levels in a factor: U
## There are new levels in a factor: U
nzv_glm_all_add
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, nzv, center, scale, dummy 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC        Sens  Spec     
##   0.8306414  0.24  0.9710165

Let's take a look at the results using visualizations.

# Collect the resampling results from every model fit so far (the first
# three fits are defined earlier in the document) so their cross-validated
# metrics can be compared side by side. The list names become plot labels.
all_cv_summary <- resamples(list(DEFAULT_ALL_ADD = fit_glm_add_all,
                                  DEFAULT_REGION_X = fit_glm_region_X,
                                  DEFAULT_CUSTOMER_X = fit_glm_customer_X,
                                  LUMP_OTHER_CUST_X = lump_glm_customer_X,
                                  UP_50_CUST_X = up_50_glm_customer_X,
                                  UP_100_CUST_X = up_100_glm_customer_X,
                                  NZV_CUSTOMER_X = nzv_glm_customer_X,
                                  NZV_CUST_X_REGION_X = nzv_glm_customer_X_region,
                                  UP_50_ALL_ADD = up_50_glm_all_add,
                                  NZV_ALL_ADD = nzv_glm_all_add
                                 ))


# Dot plot of mean cross-validated ROC (with confidence bands) per model.
dotplot(all_cv_summary, metric = 'ROC')

Judging by the graphic above, for the all-additive models my upsampled and NZV versions performed about the same as the default, which is disappointing. When looking at customer interactions, however, the significant improvement came from lumping only. That does make some sense, since that is what the original data we've been working with was: the data was 'pre-lumped' into 'other', and ultimately that is what worked best. That means I should try lumping the all-additive model below.

# All-additive recipe lumping rare customer levels (< 5% of rows) into a
# single 'other' category before dummy encoding.
lump_model_all_add <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(response) %>% 
  step_other(customer, threshold = 0.05) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) 

# Confirm the rare levels collapsed into a customer_other dummy.
lump_model_all_add %>% 
  prep(training = df_all, retain = TRUE) %>% 
  bake(new_data = NULL) %>% 
  names() %>% tail()
## [1] "customer_E"     "customer_G"     "customer_K"     "customer_M"    
## [5] "customer_Q"     "customer_other"
# Cross-validated GLM on the lumped all-additive recipe.
set.seed(98123)
lump_glm_all_add <- train(lump_model_all_add, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)

lump_glm_all_add
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, other, center, scale, dummy 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8347801  0.2333333  0.9719149

Let's try lumping combined with near-zero variance filtering…

# Lumping plus NZV filtering.
# NOTE(review): here step_nzv() is limited to all_numeric_predictors(),
# whereas the earlier NZV recipes used all_predictors() — confirm the
# narrower selection is intentional. (The results below match plain
# lumping exactly, so the NZV step removed nothing in this case.)
lump_model_all_add_nzv <- recipe(outcome ~ .,
                       data = df_all) %>% 
  step_rm(response) %>% 
  step_nzv(all_numeric_predictors()) %>% 
  step_other(customer, threshold = 0.05) %>% 
  step_center(all_numeric_predictors()) %>% 
  step_scale(all_numeric_predictors()) %>% 
  step_dummy(all_nominal_predictors()) 

# Confirm the customer dummies are unchanged from the lump-only recipe.
lump_model_all_add_nzv %>% 
  prep(training = df_all, retain = TRUE) %>% 
  bake(new_data = NULL) %>% 
  names() %>% tail()
## [1] "customer_E"     "customer_G"     "customer_K"     "customer_M"    
## [5] "customer_Q"     "customer_other"
# Cross-validated GLM on the lump + NZV recipe.
set.seed(98123)
lump_glm_all_add_nzv <- train(lump_model_all_add_nzv, data = df_all,
                 method = "glm",
                 metric = my_metric,
                 trControl = my_ctrl)

lump_glm_all_add_nzv
## Generalized Linear Model 
## 
## 1325 samples
##   37 predictor
##    2 classes: 'event', 'non_event' 
## 
## Recipe steps: rm, nzv, other, center, scale, dummy 
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1060, 1060, 1060, 1060, 1060 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8347801  0.2333333  0.9719149
# Compare the default all-additive fit against the two lumped variants.
all_cv_summary_2 <- resamples(list(DEFAULT_ALL_ADD = fit_glm_add_all,
                                  LUMP_OTHER_ALL_ADD = lump_glm_all_add,
                                  LUMP_OTHER_ALL_ADD_NZV = lump_glm_all_add_nzv
                                  
                                 ))

# Dot plot of mean cross-validated ROC per model.
dotplot(all_cv_summary_2, metric = 'ROC')

Well, we see that lumping with NZV doesn't work too well. Let's take a look at the predictions.

# Gather the held-out CV predictions from each model into one long tibble,
# each tagged with a model_name label, then overlay their ROC curves.
collect_cv_preds <- function(fit, label) {
  # Pull the resampled predictions and keep only the columns needed for ROC.
  fit$pred %>% 
    tibble::as_tibble() %>% 
    select(pred, obs, event, non_event, rowIndex, Resample) %>% 
    mutate(model_name = label)
}

model_pred_results <- bind_rows(
  collect_cv_preds(fit_glm_add_all, "DEFAULT_ADD_ALL"),
  collect_cv_preds(lump_glm_all_add, "LUMP_ADD_ALL"),
  collect_cv_preds(lump_glm_all_add_nzv, "NZV_LUMP_ADD_ALL")
)

# One ROC curve per model; yardstick treats the first factor level
# ('event', per the training output above) as the positive class.
model_pred_results %>% 
  group_by(model_name) %>% 
  roc_curve(obs, event) %>% 
  autoplot()

This confirms what we already know. I would facet by customer, but since we have grouped into ‘other’ it hardly seems worth it. Ultimately, I feel that ‘other’ grouping was the best way to go for this data.